library(Matrix)
library(MASS)
library(fda)
library(lme4)
library(pedigreemm)
library(mvnfast)
library(ggplot2)
library(plotly)

Functional Mixed-Effect Model

\[ \bf{Y} = \bf{Z^G \alpha} + \bf{Z^E \gamma} + \bf{\epsilon}\] with the following distribution of the random vectors: \[\begin{align*} \alpha &\sim N(\bf{0}, \bf{A \otimes C^G})\\ \gamma &\sim N(\bf{0}, \bf{I_N \otimes C^G})\\ \epsilon & \sim N(\bf{0}, \sigma^2_{res} *\bf{I_n}) \end{align*}\] assuming there are \(N\) individuals and total \(n\) measurements.

For the simulation study, we will fix a basis \(\phi(t)\) for our functional space and true covariance matrices \(\bf{C^G}\) and \(\bf{C^E}\). Therefore the true genetic and environmental covariance functions which will be estimated are: \[\begin{align*} G(t,s) & = \phi(t)^T \ast \bf{C^G} \ast \phi(t)\\ E(t,s) & = \phi(t)^T \ast \bf{C^E} \ast \phi(t) \end{align*}\] Then we will generate a set of responses and fit the mixed-effect model to get the estimated covariance functions, which then will be compared with the true ones.

Simulation process

Step 1: Fix functional basis, covariance matrices and residual variance. Here we will use 5 cubic B-spline basis.

### use b-spline basis
basisObj <- create.bspline.basis(c(0,1), nbasis = 5, norder = 4)
### genetic covariance matrix
C_gen <- matrix(c(750, 10 ,130, 80, 250,
              10, 800, 30, 15, 40,
              130, 30, 700, 50, 130,
              80, 15, 50, 420, 50,
              250, 40, 130, 50, 330), nrow = 5, byrow = T)
### environmental covariance matrix
C_env <- matrix(c(750, 10 ,130, 80, 250,
              10, 800, 30, 15, 40,
              130, 30, 700, 50, 130,
              80, 15, 50, 420, 50,
              250, 40, 130, 50, 330), nrow = 5, byrow = T)
### residual variance
sigma2 <- 50

Step 2: Set the distribution of the random effect vectors \(\bf{\alpha}\) and \(\bf{\gamma}\).

Remark: we will assume in total there are 873 individuals so the same genetic relationship matrix \(A\) computed in developing our mixed-effect model will be used in our simulation study. For now, assume each individual has 10 regular sampling points from the unit interval. We will use 3 (or 4) principal components to re-construct the covariance functions.

### reparameterised genetic covariance 
C_gen_para <- as(kronecker(A, C_gen), "dgCMatrix") 

### reparameterised environmental covariance 
I_N <- as(diag(873), "dgCMatrix")
C_env_para <- as(kronecker(I_N, C_env), "dgCMatrix")

### distribution of the genetic random vector
mu <- rep(0, dim(C_gen_para)[1])
alpha <- rmvn(n=50, mu = mu, sigma = C_gen_para) # here we generate 50 random vectors from multivariate normal distribution
gamma <- rmvn(n=50, mu = mu, sigma = C_env_para)

### distribution of the error vector
mu_res <- rep(0, 8730)
I_n <- as(diag(8730), "dgCMatrix")
res_cov <- as(sigma2 * I_n, "dgCMatrix") 
  
res <- rmvn(n=50, mu=mu_res, sigma = res_cov) # error vector

Step 3: Use the modular functions of lme4() to compute the random effect design matrices.

uniqueIds <- seq(1,873, length=873)
id <- rep(uniqueIds, each=10)
time_rang <- seq(0,1,length=10)
basis <- eval.basis(time_rang, basisObj)
b1 <- rep(basis[,1], times = 873)
b2 <- rep(basis[,2], times = 873)
b3 <- rep(basis[,3], times = 873)
b4 <- rep(basis[,4], times = 873)
b5 <- rep(basis[,5], times = 873)

y_pre <- rep(0, 8730) # dummy response values, will be updated 
df_pre <- data.frame(id = id, y_pre = y_pre, b1 = b1, b2 = b2, b3 = b3, b4 = b4, b5 = b5)

parsedFormula <- y_pre ~ (-1 + df_pre$b1 + df_pre$b2 + df_pre$b3 + df_pre$b4 + df_pre$b5 | df_pre$id) + 
  (-1 + df_pre$b1 + df_pre$b2 + df_pre$b3 + df_pre$b4 + df_pre$b5 | df_pre$id)

Z_pre <- t(lFormula(formula=parsedFormula, data=df_pre)$reTrms$Zt)

### update the genetic covariance matrix
L <- as(t(chol(A)), "dgCMatrix")
I5 <- as(diag(5), "dgCMatrix")
M <- kronecker(L, I5)
ZE <- Z_pre[,1:4365] 
ZG <- Z_pre[,1:4365] %*% M 
df_pre$y_pre <- as.vector(ZG %*% alpha[1,] + ZE %*% gamma[1,] + res[1,]) # random effects + residual

Step 4: Use lm() to calculate the fixed-effect.

f <- lm(y_pre ~ -1 + df_pre$b1 + df_pre$b2 + df_pre$b3 + df_pre$b4 + df_pre$b5, data = df_pre)
fix_ef <- f$coefficients[1] * df_pre$b1 + f$coefficients[2] * df_pre$b2 + f$coefficients[3] * df_pre$b3 +
  f$coefficients[4] * df_pre$b4 + f$coefficients[5] + df_pre$b5

Step 5: Generate response and reform into a dataframe for model-fitting.

response <- as.vector(fix_ef + df_pre$y_pre) 
df_simu <- data.frame(id = id, response = response)

Fit simulated data

## Data smoothing
timefine <- seq(0,1,length=25) # dense time grid
y_hat <- matrix(0, 25, 873)
lambda <- rep(0, 873)
for (i in 1:873){
  ss <- smooth.spline(time_rang, y_list[[i]], cv = FALSE)
  y_hat[,i] <- predict(ss, timefine)$y
  lambda[i] <- ss$lambda
}

matplot(timefine, y_hat, col = 1:873, type= "l") ### smoothed curves

## FPCA
fpcaobj <- prcomp(x=t(y_hat), retx = TRUE, center = TRUE, rank. = 4)
pcs <- fpcaobj$rotation # eigen vectors as basis functions for moedel-fitting

summary(pcs)
##       PC1                PC2                PC3                PC4          
##  Min.   :-0.31543   Min.   :-0.31034   Min.   :-0.42098   Min.   :-0.54873  
##  1st Qu.:-0.26880   1st Qu.:-0.08898   1st Qu.:-0.22864   1st Qu.:-0.12050  
##  Median :-0.17205   Median : 0.10105   Median :-0.04158   Median : 0.01714  
##  Mean   :-0.17514   Mean   : 0.05059   Mean   :-0.08218   Mean   :-0.00331  
##  3rd Qu.:-0.09028   3rd Qu.: 0.23144   3rd Qu.: 0.08326   3rd Qu.: 0.13593  
##  Max.   :-0.02831   Max.   : 0.27498   Max.   : 0.14811   Max.   : 0.43943
pcs[,1] %*% pcs[,2]
##              [,1]
## [1,] 9.367507e-17
## Combine the smoothed data and basis functions to a dataframe
subjectID <- rep(unique(df_simu$id), each=25)
response_test <- c(y_hat)
basis1 <- rep(pcs[,1], times = 873)
basis2 <- rep(pcs[,2], times = 873)
basis3 <- rep(pcs[,3], times = 873)
basis4 <- rep(pcs[,4], times = 873)

df_test <- data.frame(id = subjectID, y= response_test, phi1 = basis1, phi2 = basis2, phi3 = basis3, phi4 = basis4) 

## Fit FMEM
fform <- y ~ -1 + df_test$phi1 + df_test$phi2 + df_test$phi3 + 
  (-1 + df_test$phi1 + df_test$phi2 + df_test$phi3 | df_test$id) + 
  (-1 + df_test$phi1 + df_test$phi2 + df_test$phi3 | df_test$id)

system.time(
 ft <-  fit_genetic_fmm(fform, df_test, A, pcs[,1:3])
) # user   system   elapsed
##   用户   系统   流逝 
## 175.73   8.79 271.32
summary(ft)
## Linear mixed model fit by REML ['lmerMod']
## 
## REML criterion at convergence: 176501.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.8032 -0.5044  0.0076  0.5180  7.3512 
## 
## Random effects:
##  Groups       Name         Variance Std.Dev. Corr       
##  df_test.id   df_test$phi1 10698.65 103.434             
##               df_test$phi2  3978.43  63.075  -0.15      
##               df_test$phi3  3486.79  59.049   0.25 -0.09
##  df_test.id.1 df_test$phi1   673.95  25.961             
##               df_test$phi2    82.06   9.059  -1.00      
##               df_test$phi3   277.50  16.658   1.00 -1.00
##  Residual                    127.08  11.273             
## Number of obs: 21825, groups:  df_test$id, 873
## 
## Fixed effects:
##              Estimate Std. Error t value
## df_test$phi1   38.464     11.049   3.481
## df_test$phi2  -21.163      6.729  -3.145
## df_test$phi3   20.376      6.323   3.222
## 
## Correlation of Fixed Effects:
##             df_t$1 df_t$2
## df_test$ph2 -0.156       
## df_test$ph3  0.252 -0.092
# Extract covariance function
vc <- VarCorr(ft)
CG <- vc[["df_test.id"]] ## genetic covariance
CE <- vc[["df_test.id.1"]] ## environmental covariance


### Convert to genetic covariance function
CG_fun <- pcs[,1:3] %*% CG %*% t(pcs[,1:3])
### environmental covariance function
CE_fun <- pcs[,1:3] %*% CE %*% t(pcs[,1:3])
### Phenotypic covariance function
P_fun <- CG_fun + CE_fun

Covariance estimation when 4 PCs used as basis functions

## Fit FMEM
fform1 <- y ~ -1 + df_test$phi1 + df_test$phi2 + df_test$phi3 + df_test$phi4 + 
  (-1 + df_test$phi1 + df_test$phi2 + df_test$phi3 + df_test$phi4 | df_test$id) + 
  (-1 + df_test$phi1 + df_test$phi2 + df_test$phi3 + df_test$phi4 | df_test$id)

system.time(
 ft1 <-  fit_genetic_fmm(fform1, df_test, A, pcs)
) # user   system   elapsed
##    用户    系统    流逝 
##  867.79   34.80 2420.16
summary(ft1)
## Linear mixed model fit by REML ['lmerMod']
## 
## REML criterion at convergence: 161955.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.1931 -0.4539  0.0046  0.4551  4.7763 
## 
## Random effects:
##  Groups       Name         Variance Std.Dev. Corr             
##  df_test.id   df_test$phi1 10648.81 103.193                   
##               df_test$phi2  4090.85  63.960  -0.15            
##               df_test$phi3  3483.33  59.020   0.23 -0.08      
##               df_test$phi4   985.57  31.394   0.04 -0.09 -0.09
##  df_test.id.1 df_test$phi1   765.85  27.674                   
##               df_test$phi2    84.21   9.176  -1.00            
##               df_test$phi3   347.04  18.629   1.00 -1.00      
##               df_test$phi4     8.22   2.867  -1.00  1.00 -1.00
##  Residual                     53.05   7.284                   
## Number of obs: 21825, groups:  df_test$id, 873
## 
## Fixed effects:
##              Estimate Std. Error t value
## df_test$phi1   38.511     11.024   3.493
## df_test$phi2  -21.180      6.815  -3.108
## df_test$phi3   20.423      6.320   3.232
## df_test$phi4  -14.236      3.351  -4.248
## 
## Correlation of Fixed Effects:
##             df_t$1 df_t$2 df_t$3
## df_test$ph2 -0.151              
## df_test$ph3  0.233 -0.082       
## df_test$ph4  0.032 -0.087 -0.090
# Extract covariance function
vc1 <- VarCorr(ft1)
CG1 <- vc1[["df_test.id"]] ## genetic covariance
CE1 <- vc1[["df_test.id.1"]] ## environmental covariance


### Convert to genetic covariance function
CG_fun1 <- pcs %*% CG1 %*% t(pcs)
### environmental covariance function
CE_fun1 <- pcs %*% CE1 %*% t(pcs)
### Phenotypic covariance function
P_fun1 <- CG_fun1 + CE_fun1